{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "83a41ac9",
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入相关包\n",
    "import pandas as pd \n",
    "import numpy as np \n",
    "import matplotlib.pyplot as plt \n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selec\n",
    "\n",
    "\n",
    "tion  import train_test_split\n",
    "from sklearn import linear_model\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn import tree\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "3005cbd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取相关文件\n",
    "train_path = '../data/train.csv'\n",
    "test_path = '../data/test.csv'\n",
    "submit_path = '../data/车辆贷款违约预测挑战赛sample_submit.csv'\n",
    "train_data = pd.read_csv(train_path)\n",
    "test_data = pd.read_csv(test_path)\n",
    "submit_data = pd.read_csv(submit_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "856489da",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5000002.0\n"
     ]
    }
   ],
   "source": [
    "inf_count = np.isinf(train_data['outstanding_disburse_ratio']).value_counts()[1]\n",
    "notna_max = np.sort(train_data['outstanding_disburse_ratio'].values)[-(inf_count+1)]+1\n",
    "print(notna_max)\n",
    "#train_data['outstanding_disburse_ratio'][np.isinf(train_data['outstanding_disburse_ratio'])] = np.nan\n",
    "train_data[np.isinf(train_data)] = notna_max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "2f531a48",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2500002.0\n"
     ]
    }
   ],
   "source": [
    "inf_count = np.isinf(test_data['outstanding_disburse_ratio']).value_counts()[1]\n",
    "notna_max = np.sort(test_data['outstanding_disburse_ratio'].values)[-(inf_count+1)]+1\n",
    "print(notna_max)\n",
    "#train_data['outstanding_disburse_ratio'][np.isinf(train_data['outstanding_disburse_ratio'])] = np.nan\n",
    "test_data[np.isinf(test_data)] = notna_max"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "29ad044c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#生成提交文件\n",
    "submit_data['customer_id'] = test_data['customer_id']\n",
    "submit_data['loan_default'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "dbf08085",
   "metadata": {},
   "outputs": [],
   "source": [
    "#删除特征\n",
    "train_data = train_data.drop(['year_of_birth','customer_id','mobileno_flag', 'idcard_flag', 'disbursed_date'],axis=1)\n",
    "test_data = test_data.drop(['year_of_birth','customer_id','mobileno_flag', 'idcard_flag', 'disbursed_date'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "b9882367",
   "metadata": {},
   "outputs": [],
   "source": [
    "#划分数据集\n",
    "labels = train_data['loan_default'].values\n",
    "features_scaler = train_data.drop(['loan_default'],axis=1).values\n",
    "X_train,X_test, y_train, y_test = train_test_split(features_scaler, labels, test_size = 0.2, random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "26d1f570",
   "metadata": {},
   "outputs": [],
   "source": [
    "#标准化特征值\n",
    "sc = StandardScaler()\n",
    "sc.fit(X_train)\n",
    "X_train_std = sc.transform(X_train)\n",
    "X_test_std = sc.transform(X_test)\n",
    "\n",
    "#标准化特征值(测试数据)\n",
    "sc = StandardScaler()\n",
    "sc.fit(test_data)\n",
    "test_data_std = sc.transform(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "9053211c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_f1(model):\n",
    "    y_pred = model.predict(X_test_std)\n",
    "    y_true = y_test\n",
    "    score = f1_score(y_true, y_pred, average='macro')\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "d6ecb182",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f1: 0.5233497199406105\n"
     ]
    }
   ],
   "source": [
    "#训练决策树模型\n",
    "model = tree.DecisionTreeClassifier()\n",
    "model = model.fit(X_train_std, y_train)\n",
    "print(\"f1:\",calculate_f1(model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "040bdef5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}\n"
     ]
    }
   ],
   "source": [
    "print(model.get_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "16b206f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "预测逾期人数： 8791\n"
     ]
    }
   ],
   "source": [
    "result =(model.predict(test_data_std))\n",
    "print(\"预测逾期人数：\",np.sum(result))\n",
    "submit_data['loan_default'] = result\n",
    "submit_data.to_csv('../data/submit/decisiontree.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "d6e9f8f5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最优分类器: {'criterion': 'gini', 'max_depth': 10, 'min_impurity_decrease': 0.01, 'min_samples_leaf': 1} 最优分数: 0.8235166666666668\n"
     ]
    }
   ],
   "source": [
    "# 用GridSearchCV寻找最优参数（字典）\n",
    "param = {'criterion':['gini'],'max_depth':[10,20,30,50],'min_samples_leaf':[1,2,3,5,10,15],'min_impurity_decrease':[0.01,0.05,0.1,0.2,0.5]}\n",
    "grid = GridSearchCV(tree.DecisionTreeClassifier(),param_grid=param,cv=6)\n",
    "grid.fit(X_train_std, y_train)\n",
    "print('最优分类器:',grid.best_params_,'最优分数:', grid.best_score_)  # 得到最优的参数和分值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "b5d8adaf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f1: 0.4508813354565922\n"
     ]
    }
   ],
   "source": [
    "#训练决策树模型\n",
    "model = tree.DecisionTreeClassifier(criterion='gini', max_depth= 100, min_impurity_decrease=0.01, min_samples_leaf= 1)\n",
    "model = model.fit(X_train, y_train)\n",
    "print(\"f1:\",calculate_f1(model))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "641a6929",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集分数： 0.8235166666666667 测试集分数 0.8211\n"
     ]
    }
   ],
   "source": [
    "# 用最佳的参数训练\n",
    "model = tree.DecisionTreeClassifier(criterion='gini', max_depth= 50, min_impurity_decrease=0.1, min_samples_leaf= 2)\n",
    "model = model.fit(X_train, y_train)\n",
    "y_pred = model.predict(X_test_std)\n",
    "print('训练集分数：', model.score(X_train, y_train),'测试集分数',model.score(X_test,y_test))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
